# get logcpm values & factorize Batch info
assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
se$Batch <- factor(se$Batch)
# separate the 2 cell lines
se.hela <- se[,se$Cell_Line=="HeLa"]
se.hela <- se.hela[rowSums(assay(se.hela) >= sel[1]) >= sel[2],]
se.hek <- se[,se$Cell_Line!="HeLa"]
se.hek <- se.hek[rowSums(assay(se.hek) >= sel[1]) >= sel[2],]Question: should we correct for batch?
# plot PCA
## combined
plgINS::plPCA(assays(se)$logcpm, as.data.frame(colData(se)), colorBy = "Cell_Line",
add.labels = FALSE, annotation_columns = colnames(colData(se))[1:3])## HeLa
p.hela <- plgINS::plPCA(assays(se.hela)$logcpm, as.data.frame(colData(se.hela)),
colorBy = "miRNA", shapeBy = "Batch", add.labels = FALSE,
annotation_columns = colnames(colData(se.hela))[1:3])
add_markers(p.hela, color = ~se.hela$miRNA, colors = "Paired")## HeLa batch effect
plgINS::plPCA(assays(se.hela)$logcpm, as.data.frame(colData(se.hela)), colorBy = "Batch",
add.labels = FALSE, annotation_columns = colnames(colData(se.hela))[1:3])## HEK
p.hek <- plgINS::plPCA(assays(se.hek)$logcpm, as.data.frame(colData(se.hek)),
colorBy = "miRNA", shapeBy = "Batch", add.labels = FALSE,
annotation_columns = colnames(colData(se.hek))[1:3])
add_markers(p.hek, color = ~se.hek$miRNA, colors = "Paired")## HEK batch effect
plgINS::plPCA(assays(se.hek)$logcpm, as.data.frame(colData(se.hek)), colorBy = "Batch",
add.labels = FALSE, annotation_columns = colnames(colData(se.hek))[1:3])Batch correction for HeLa cells: definitely.
Batch correction for HEK cells: maybe not necessary. Let’s compare both DEAs.
#' genCTRL
#'
#' @param se SE object
#' @param logcpm character: name of the logcpm assay contained in supplied SE object
#'
#' @return matrix: control samples generated out of existing samples, one for each batch
#'
genCTRL <- function(se, logcpm){
# e.ctrl <- sapply(unique(se$Batch), ls=min(colSums(assay(se))),
# FUN=function(x,ls){
# rowMedians(exp(assays(se)[[logcpm]][,se$Batch==x])-1) * ls/1000000
# }
# )
e.ctrl <- sapply(unique(se$Batch), ls=colSums(assay(se)),
FUN=function(x,ls){
rowMedians(expm1(assays(se)[[logcpm]][,se$Batch==x])) * median(ls[se$Batch==x])/1000000
}
)
### generate colnames
n.cols <- sapply( unique(se$Batch), FUN=function(x) var_name <- paste("CTRL", x, sep=".") )
colnames(e.ctrl) <- n.cols
rownames(e.ctrl) <- rownames(se)
return(e.ctrl)
}#' bartelDEA
#'
#' Generates control samples out of the average values over all supplied samples,
#' calculates their logCPM & log2FC data and performs DEA over all treatments & individual ones.
#' All is assembled into one SE object.
#'
#'
#' @param se SummarizedExperiment object containing assays of raw counts
#'
#' @return an SE object
#'
bartelDEA <- function(se, model, model0=~1){
se$Batch <- droplevels(se$Batch)
# generate control
## create logcpm assay
assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
## generate a 'control' sample out of the median normalized counts over all samples
e.ctrl <- genCTRL(se, "logcpm")
## add control samples to assays & generate DGEList object
dds <- DGEList(cbind(assay(se), e.ctrl))
## generate colData info for combined assay
dd <- rbind(colData(se)[,c("Batch","miRNA","Cell_Line")], data.frame(Batch=unique(se$Batch),
Cell_Line=unique(se$Cell_Line),
miRNA="CTRL"))
dd$miRNA <- relevel(droplevels(dd$miRNA), ref="CTRL")
# DEA
## normalization
dds <- calcNormFactors(dds)
## models
mm <- model.matrix(model, data=dd)
mm0 <- model.matrix(model0, data=dd)
testCoeff <- setdiff(colnames(mm), colnames(mm0))
## estimate dispersion
dds <- estimateDisp(dds,mm)
## fit negative binomial distribution on counts per gene (use glmFit for few replicates)
fit <- glmFit(dds,mm)
## fit linear model on all samples
res <- as.data.frame(topTags(glmLRT(fit, testCoeff),Inf))
## fit linear model dropping one sample at a time (using multiple cores)
res.list <- bplapply( testCoeff, BPPARAM=MulticoreParam(8), FUN=function(x){
as.data.frame(topTags(glmLRT(fit, x), Inf))
})
dea.names <- make.names(gsub("miRNA","", testCoeff))
names(res.list) <- paste0("DEA.",dea.names)
colnames(res)[grepl("logFC",colnames(res))] <- paste0("logFC.", dea.names)
# generate final SE object
## SE object with logCPM & logFC assays & batch-corrected logCPM assays (for visualization)
se <- SummarizedExperiment(assays=list(counts=dds$counts),
rowData=rowData(se),
colData=dd)
assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
se <- plgINS::svacor(se, form = model, form0 = model0)
se <- SEtools::log2FC(se, controls = se$miRNA=="CTRL", by = se$readtype, fromAssay = "corrected", isLog = TRUE)
## add DEAs
rowData(se)$DEA.all <- DataFrame(res[rownames(se),])
for(i in paste0("DEA.",dea.names)){
rowData(se)[[i]] <- DataFrame(res.list[[i]][rownames(se),])
}
# output
return(se)
}## Number of significant surrogate variables is: 4
## Iteration (out of 5 ):1 2 3 4 5
## Number of significant surrogate variables is: 2
## Iteration (out of 5 ):1 2 3 4 5
# DEA HEK without taking batch effect into account
se.hek.nb <- se[,se$Cell_Line!="HeLa"]
se.hek.nb <- se.hek.nb[rowSums(assay(se.hek.nb) >= sel[1]) >= sel[2],]
se.hek.nb <- bartelDEA(se.hek.nb, model = ~1+miRNA)## Number of significant surrogate variables is: 3
## Iteration (out of 5 ):1 2 3 4 5
# select significant gene expression changes
degs.hela <- lapply(rowData(se.hela)[grepl("DEA.",colnames(rowData(se.hela)))], FUN=function(x) row.names(x)[x$FDR<0.05])
degs.hek <- lapply(rowData(se.hek)[grepl("DEA.",colnames(rowData(se.hek)))], FUN=function(x) row.names(x)[x$FDR<0.05])
degs.hek.nb <- lapply(rowData(se.hek.nb)[grepl("DEA.",colnames(rowData(se.hek.nb)))], FUN=function(x) row.names(x)[x$FDR<0.05])
# number of significant gene expression changes per sample
## HeLa
sapply(degs.hela,length)## DEA.all DEA.let.7a DEA.lsy.6 DEA.miR.1 DEA.miR.124 DEA.miR.137
## 13872 3088 1817 5895 5111 3608
## DEA.miR.139 DEA.miR.143 DEA.miR.144 DEA.miR.153 DEA.miR.155 DEA.miR.182
## 1242 461 298 2400 2194 2295
## DEA.miR.199a DEA.miR.204 DEA.miR.205 DEA.miR.216b DEA.miR.223 DEA.miR.7
## 1679 946 1207 1394 1705 3565
## DEA.all DEA.miR.122 DEA.miR.133 DEA.miR.138 DEA.miR.145 DEA.miR.184
## 10504 461 587 3100 886 113
## DEA.miR.190a DEA.miR.200b DEA.miR.216a DEA.miR.217 DEA.miR.219a DEA.miR.375
## 748 1166 317 438 418 771
## DEA.miR.451a
## 137
## DEA.all DEA.miR.122 DEA.miR.133 DEA.miR.138 DEA.miR.145 DEA.miR.184
## 8828 289 415 2051 451 84
## DEA.miR.190a DEA.miR.200b DEA.miR.216a DEA.miR.217 DEA.miR.219a DEA.miR.375
## 431 787 223 320 317 513
## DEA.miR.451a
## 90
# comparison of HEK DEA with and without batch effect taken into account
## median number of significant changes: HEK w batch
median(unlist(lapply(degs.hek,length)))## [1] 587
## [1] 415
## which proportion of significant tx do we see in batch vs no batch and vice versa
## e.g. for miR-122 63% of tx found in batch corrected DEA are also found in uncorrected
dea.names <- colnames(rowData(se.hek)[grepl("DEA.",colnames(rowData(se.hek)))])
t(sapply(dea.names, FUN = function(x){
b_in_nb <- sum(degs.hek[[x]] %in% degs.hek.nb[[x]])/length(degs.hek[[x]])
nb_in_b <- sum(degs.hek.nb[[x]] %in% degs.hek[[x]])/length(degs.hek.nb[[x]])
c(b_in_nb=b_in_nb, nb_in_b=nb_in_b)
}))## b_in_nb nb_in_b
## DEA.all 0.8272087 0.9842546
## DEA.miR.122 0.6268980 1.0000000
## DEA.miR.133 0.7069847 1.0000000
## DEA.miR.138 0.6609677 0.9990249
## DEA.miR.145 0.5090293 1.0000000
## DEA.miR.184 0.7433628 1.0000000
## DEA.miR.190a 0.5762032 1.0000000
## DEA.miR.200b 0.6749571 1.0000000
## DEA.miR.216a 0.7034700 1.0000000
## DEA.miR.217 0.7260274 0.9937500
## DEA.miR.219a 0.7559809 0.9968454
## DEA.miR.375 0.6653696 1.0000000
## DEA.miR.451a 0.6569343 1.0000000
We get more significant results with the batch corrected HEK DEA & almost all significant tx of the batch uncorrected HEK DEA are present in the corrected one. We thus choose the batch corrected version.
# load genes to be excluded
skewed.hela <- readRDS("data/bartel.hela.excludedGenes.rds")
skewed.hek <- readRDS("data/bartel.hek.excludedGenes.rds")
# remove genes not present in spliced/unspliced bartel DEA
se.hela <- se.hela[!(rownames(se.hela) %in% skewed.hela),]
se.hek <- se.hek[!(rownames(se.hek) %in% skewed.hek),]# round
source("functions/roundSE.R")
se.hela <- roundSE(se.hela)
# export HeLa
saveRDS(se.hela, output1)# heatmaps: comparing batch uncorrected vs corrected
## HeLa logcpm
sehm(se.hela[,order(se.hela$miRNA)], degs.hela$DEA.miR.143, do.scale = TRUE,
anno_columns = c("miRNA","Batch"), assayName = "logcpm",
show_rownames = FALSE, breaks = TRUE)## HeLa logcpm batch corrected
sehm(se.hela[,order(se.hela$miRNA)], degs.hela$DEA.miR.143, do.scale = TRUE,
anno_columns = c("miRNA","Batch"), assayName = "corrected",
show_rownames = FALSE, breaks = TRUE)## HEK logcpm
sehm(se.hek[,order(se.hek$miRNA)], degs.hek$DEA.miR.184, do.scale = TRUE,
anno_columns = c("miRNA","Batch"), assayName = "logcpm",
show_rownames = FALSE, breaks = TRUE)## HEK logcpm batch corrected
sehm(se.hek[,order(se.hek$miRNA)], degs.hek$DEA.miR.184, do.scale = TRUE,
anno_columns = c("miRNA","Batch"), assayName = "corrected",
show_rownames = FALSE, breaks = TRUE)